

### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: run the line below (prints the last-modified time of this script file;
### the path is taken from rstudioapi, so it has to be run from within RStudio)
file.info(rstudioapi::getActiveDocumentContext()$path)$mtime


### Script purpose: calculate the target indicators based on the cleaned pay entry-level dataset
### Execution time: ~20 minutes


### Inputs: 
# 1)  /clean/00_database/['Country database' + date (ddmmyyyy) + '(' + [section name] ')' ].csv -> see explanation in the previous script
# 2) /clean/country_limpio_final.qs (output of script 10)



### Outputs:
# 1)  /clean/00_database/['Country database' + date (ddmmyyyy) + '(' + [section name] ')' ].csv -> cleaned versions of the dashboard indicators
# 2) /clean/Country full database ([ddmmyyyy]).csv - full dashboard database with date of creation (the 'todo el gobierno' and 'DNSC' subsets are written next to it as separate .csv files)



#
# SET-UP ----------------------------------------------------------------------------
#


# start from an empty global environment
rm(list = ls())

### source global file with packages, functions and global parameters
# (the file is looked up in the folder of this script and must match 'global*.R')
source(list.files(
  path = dirname(rstudioapi::getActiveDocumentContext()$path),
  pattern = 'global.*\\.R$',
  full.names = TRUE
))


### create copy of the script
# the extension pattern is anchored to the end of the path and replaced once, so a '.R'
# that happens to occur elsewhere in the path (e.g. in a folder name) is left untouched
script_path = rstudioapi::getSourceEditorContext()$path
file.copy(script_path,
          gsub('code', 'code/00_ARCHIVE', sub('\\.R$', ' - copy.R', script_path)),
          overwrite = TRUE, copy.date = TRUE)





#
# READ DATA -------------------------------------------------------------------------------------------------------------------------------------------------
#

# load the cleaned dataset (`format1` is presumably set in the sourced global file)
country = read_flex(file = file.path('data', 'clean', 'country_full_final'), format = format1)

# set as data.table (by reference) if not one already;
# inherits() checks the class directly instead of regex-matching the class vector
if (!inherits(country, 'data.table')) {setDT(country)}
gc()






# '  -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# MANUAL VARIABLES --------------------------------------------------------------------------------------------------------------------------------------------------------
#

### miscellaneous name cleaning --------------------------------------------------------------------------------------------------------------------------------------------------------

### clean indicator names if typed incorrectly

# section file whose indicator label is repaired in place (read, fix, write back)
equidad_path = file.path('data', 'clean', '00_database', 'Country database 02102025 (equidad y compresion salarial).csv')
dta = read_flex(equidad_path)

# rows whose label contains the wording below (literal match, not a regex) get the full label
etiqueta_incompleta = grepl('Dentro del mismo rango, cuántos salarios gana un asalariado del 20% superior de la distribución salarial en comparación con uno del 20% inferior',
                            dta$indicador, fixed = TRUE)
dta$indicador[etiqueta_incompleta] = 'Dentro del mismo rango, cuántos salarios gana un asalariado del 20% superior de la distribución salarial en comparación con uno del 20% inferior? (por ano)'

write_flex(dta, equidad_path)


### check for indicators with 0 or too high SD
# `dta_all` is only built in the 'Make a single file' section further down, so fail with a
# clear message if this check is run before that section
if (!exists('dta_all')) {
  stop("'dta_all' not found - run the 'Make a single file' section first", call. = FALSE)
}

# SD of each indicator across its government-wide rows
sd_0 = dta_all %>% 
  filter(grepl('Todas', tipo_organizacion), grepl('Todas', organizacion)) %>% 
  group_by(indicador, cubertura, tipo_organizacion, organizacion) %>%
  filter(n() > 1) %>% # leave only groups with >1 row present
  summarise(sd = sd_miss(value), .groups = 'drop') # return an ungrouped table

min(sd_0$sd)
sf(sd_0$sd == 0)


# indicators whose SD is missing or exactly zero
unique(sd_0$indicador[is.na(sd_0$sd) | sd_0$sd == 0])

### drop three grade-promotion indicators from the promotions section file
ascensos_path = file.path('data', 'clean', '00_database', 'Country database 02102025 (ascensos y trayectorias profesionales).csv')
dta = read_flex(ascensos_path)
dim(dta)

# labels of the indicators to be removed
indicadores_a_quitar = c("Porcentaje de empleados que recibieron un ascenso de grado cambiando su organización (por año)",
                         "Porcentaje de empleados que recibieron un ascenso de grado dentro de su organización (por año)",
                         "Porcentaje de empleados que recibieron un ascenso de grado en el sector público (por año)")
dta = filter(dta, !indicador %in% indicadores_a_quitar)

# write the reduced table back to the same file
write_flex(dta, ascensos_path)







### + % of expenditure increase due to i) employment number increase vs. ii) average pay increase, by year -----------------------
# run dash_summary() with i = 6
# temp1 = dta_all %>% filter(indicaDor == '% of expenditure increase due to i) employment number increase vs. ii) average pay increase, by year') %>% 
#   arrange(anyo)
# read or update instructions to the newest version before running the loop
# (`country_instructions`, used below, is presumably created by this call - TODO confirm)
read_instructions()

# country[, sum(pago * fte*12), by = .(anyo)]
# country[, sum(fte), by = .(anyo)]
# country[, weighted.mean(pago, fte), by = .(anyo)]
# 
# 5002918408/5585316404 # gastos
# 365036.5/ 428595.3  # empleo
# 1142.10/ 1085.973 # # pago medio

# row numbers ('fila') of the relevant indicators, found by matching part of their label;
# for the average-pay indicator the lowest matching row is taken
row_gastos = with(country_instructions, fila[grepl('debido a cambios en el número de empleo', indicador)])
row_empleo = with(country_instructions, fila[grepl('Crecimiento anual del empleo público', indicador)])
row_pago_medio = with(country_instructions, min(fila[grepl('Incremento salarial medio anual por empleado', indicador)]))

# dash_summary(row_gastos, cubertura_type = 'max_cubertura', grupo_central = 'country') # gastos 
# dash_summary(row_empleo, cubertura_type = 'max_cubertura', grupo_central = 'country') # empleo
# dash_summary(row_pago_medio, cubertura_type = 'max_cubertura', grupo_central = 'country') # pago medio


# apply function - to full vs partial covered orgs, sectors, and individual organizations
# each grouping level (country / sector / organisation) is computed under both coverage types;
# `grupo_central` is now named in all six calls (two of them used to pass it by position)
temp_empleo = rbindlist(list(
  dash_summary(row_empleo, cubertura_type = 'max_cubertura', grupo_central = 'country'),
  dash_summary(row_empleo, cubertura_type = 'max_comparabilidad', grupo_central = 'country'),
  dash_summary(row_empleo, cubertura_type = 'max_cubertura', grupo_central = c('country', 'sectorial_nombre')),
  dash_summary(row_empleo, cubertura_type = 'max_comparabilidad', grupo_central = c('country', 'sectorial_nombre')),
  dash_summary(row_empleo, cubertura_type = 'max_cubertura', grupo_central = c('country', 'sectorial_nombre', 'entidad_nombre')),
  dash_summary(row_empleo, cubertura_type = 'max_comparabilidad', grupo_central = c('country', 'sectorial_nombre', 'entidad_nombre'))
),
use.names = TRUE, fill = TRUE) %>% 
  # relabel the rows with the decomposition indicator and tag them as the employment component
  mutate(indicador = country_instructions$indicador[grepl('salarios atribuido al cambio en el ', country_instructions$indicador)], 
         group_1 = 'Aumento empleo'
  ) %>% 
  # put the component label and the value in the last two columns
  relocate(c(group_1, value), .after = last_col()) %>% 
  distinct()

# same six summaries for the average-pay indicator: each grouping level under both coverage types;
# `grupo_central` is now named in all six calls (two of them used to pass it by position)
temp_pago = rbindlist(list(
  dash_summary(row_pago_medio, cubertura_type = 'max_cubertura', grupo_central = 'country'),
  dash_summary(row_pago_medio, cubertura_type = 'max_comparabilidad', grupo_central = 'country'),
  dash_summary(row_pago_medio, cubertura_type = 'max_cubertura', grupo_central = c('country', 'sectorial_nombre')),
  dash_summary(row_pago_medio, cubertura_type = 'max_comparabilidad', grupo_central = c('country', 'sectorial_nombre')),
  dash_summary(row_pago_medio, cubertura_type = 'max_cubertura', grupo_central = c('country', 'sectorial_nombre', 'entidad_nombre')),
  dash_summary(row_pago_medio, cubertura_type = 'max_comparabilidad', grupo_central = c('country', 'sectorial_nombre', 'entidad_nombre'))
),
 use.names = TRUE, fill = TRUE) %>% 
  # relabel the rows with the decomposition indicator and tag them as the average-pay component
  mutate(indicador = country_instructions$indicador[grepl('salarios atribuido al cambio en el ', country_instructions$indicador)], 
         group_1 = 'Aumento salario medio'
         ) %>% 
  # put the component label and the value in the last two columns
  relocate(c(group_1, value), .after = last_col()) %>% 
  distinct()



# stack both components and compute the year-on-year change of `value` within each
# coverage x sector x organisation x component series
temp = rbindlist(list(temp_empleo, temp_pago), use.names = TRUE, fill = TRUE) %>% 
  # missing coverage / sector / organisation labels are set to "Todo"
  mutate(across(c(cubertura, sectorial_nombre, entidad_nombre), ~replace_na(.x, "Todo"))) %>% 
  group_by(cubertura, sectorial_nombre, entidad_nombre, group_1) %>% 
  arrange(anyo) %>% 
  mutate(value2 = value - lag(value, default = NA)) %>% 
  # the first year of each series has no previous year to compare with
  filter(anyo != min(anyo)) %>% 
  # drop the grouping so it does not carry over into later steps
  ungroup()
  



#### <> add to the correct (sub-)file ---------------------------------------------------------------
seccion1 = 'gastos salariales'

# section file that receives the decomposition rows
# NOTE(review): `date1` is assigned in the 'define date' section further down - it has to exist before this runs
gastos_path = file.path('data', 'clean', '00_database', paste0('Country database ', date1 ,' (gastos salariales totales).csv'))

dta = read_flex(gastos_path)

# remove rows previously stored under this indicator, then append the freshly computed ones
dta = dta %>% filter(indicador != 'Porcentaje de aumento anual del gasto en salarios atribuido al cambio en el número de empleos vs. el salario medio (anual)')
dta = rbindlist(list(dta, temp), use.names = TRUE, fill = TRUE)


write_flex(dta, gastos_path)






### ' ------------------------------------------------------------------------------------------------------------------------------------

# reload the dataset if it is not in memory
if(!exists('country')){country = read_flex(file = file.path('data', 'clean', 'country_full_final'), format = format1)}
gc()



### + modelling gender gap -----------------------------------------------------------------------------------------------------

### feols() from fixest should be equivalent, but faster than base lm()

### take annual pay values
# one row per person x year x gender x organisation x sector x coverage x contract type x grade x region
country_gap_annual = country[, .(pago_annual = fsum(pago_bruto)), by = .(person_id, anyo, genero, 
                                                                     entidad_nombre, sectorial_nombre, cubertura,
                                                                     contract_type_dummy, grado, region)]

### leave only orgs with >min_gender men and women each
# number of distinct men / women per organisation-year, added by reference
country_gap_annual[, `:=`(n_hombre = uniqueN(person_id[genero == 'hombre']),
                          n_mujer  = uniqueN(person_id[genero == 'mujer'])),
                   by = .(anyo, entidad_nombre)]

min_gender = 50
pr(country_gap_annual$n_hombre > min_gender & country_gap_annual$n_mujer > min_gender)
# assign the subset back - previously the filtered table was only printed, so the
# threshold had no effect on the data used in the regressions
country_gap_annual = country_gap_annual[n_hombre > min_gender & n_mujer > min_gender]


# candidate regressors for the pay model
preds <- c("genero","region","contract_type_dummy","grado")


pr(country$sectorial_nombre)
# the full dataset is not needed for the regressions - free the memory
rm(country)
gc()


# One set of regressions per sector: within the sector, a model of log annual pay is fitted
# for every year x organisation and the coefficient on gender is kept.
# Each sector's table is stored as a global object 'res_<cleaned sector name>';
# `res` itself holds the table of the last sector processed.
for(sector1 in unique(country_gap_annual$sectorial_nombre)){
  
  if(is.na(sector1)){next}
  print(sector1)
  
  ### do regression
  res <- country_gap_annual[
    sectorial_nombre == sector1
    , {
      
      DT <- .SD
      
      # predictors you want, keep only those with >=2 levels in THIS group
      n_levels <- vapply(preds, function(p) uniqueN(DT[[p]]), integer(1))
      use <- preds[n_levels > 1L]
      
      # default result: NA when there is no gender variation, the model fails,
      # or no gender coefficient is estimated
      coef_genero <- NA_real_
      
      # plain if-blocks are used instead of return(): j is an expression, not a function body
      if ("genero" %in% use) {
        fml <- as.formula(paste("log(pago_annual) ~", paste(use, collapse = " + ")))
        
        # standard errors clustered by person
        fit <- try(feols(fml, data = DT, vcov = ~ person_id), silent = TRUE)
        
        if (!inherits(fit, "try-error")) {
          # extract the (first) genero coefficient if present
          cn <- names(coef(fit))
          gcoef <- cn[grepl("^genero", cn)]
          if (length(gcoef) > 0) coef_genero <- coef(fit)[[gcoef[1L]]]
        }
      }
      
      # same column name on every path, so the result always has a `value` column
      list(value = coef_genero)
    },
    by = .(anyo, sectorial_nombre, entidad_nombre)
  ]
  
  assign(paste0('res_', gsub(' ', '_', clean_text(sector1))),
         res)
  
}


### check if the result is the same for selected orgs when using simple OLS

# bar chart of (1 - gender coefficient) per organisation, ordered by coefficient,
# one panel per year, with a dashed reference line at 1
ggplot(res,
       aes(x = reorder(entidad_nombre, value), y = 1 - value, group = entidad_nombre)) +
  geom_col() +
  geom_hline(aes(yintercept = 1), linetype = 'dashed') +
  facet_wrap(~anyo, scales = 'free')
  


### clean reg results --------------------------------------------------------------
### add right cubertura and sectors for each entidad
# `country` has been removed from memory by this point (rm(country) before the regressions),
# so the lookup tables are built from `country_gap_annual`, which carries the same columns
# NOTE(review): `brecha_entidad` / `brecha_sectorial` are not created in the code visible here - confirm their source
temp = unique(country_gap_annual[, .(cubertura, sectorial_nombre, entidad_nombre)])
brecha_entidad = inner_join(brecha_entidad, temp)

temp = unique(country_gap_annual[, .(cubertura, sectorial_nombre)])
brecha_sectorial = inner_join(brecha_sectorial, temp)


### combine the files
# stack the coverage-, sector- and organisation-level results (columns matched by name)
brecha_dta = rbindlist(list(brecha_cubertura, brecha_sectorial, brecha_entidad),
                       use.names = TRUE, fill = TRUE)

# relabel coverage ('completo' vs anything else) and expose the estimate as `value`
brecha_dta = brecha_dta %>% 
  mutate(cubertura = ifelse(cubertura == 'completo',
                            'Maximizar cubertura',
                            'Maximizar comparabilidad a lo largo del tiempo')) %>% 
  rename(value = x1)


### add average wage to express diff as % 
# instruction row(s) of the mean gross pay indicator, matched on its label
row_pago = country_instructions$fila[grepl('Remuneración media bruta ',
                                             country_instructions$indicador)]
# each grouping level under both coverage types; `grupo_central` is now named in all six
# calls (two of them used to pass it by position)
temp_pago = rbindlist(list(
  dash_summary(row_pago, cubertura_type = 'max_cubertura', grupo_central = 'country'),
  dash_summary(row_pago, cubertura_type = 'max_comparabilidad', grupo_central = 'country'),
  dash_summary(row_pago, cubertura_type = 'max_cubertura', grupo_central = c('country', 'sectorial_nombre')),
  dash_summary(row_pago, cubertura_type = 'max_comparabilidad', grupo_central = c('country', 'sectorial_nombre')),
  dash_summary(row_pago, cubertura_type = 'max_cubertura', grupo_central = c('country', 'sectorial_nombre', 'entidad_nombre')),
  dash_summary(row_pago, cubertura_type = 'max_comparabilidad', grupo_central = c('country', 'sectorial_nombre', 'entidad_nombre'))
),
use.names = TRUE, fill = TRUE) %>% 
  # the mean wage is stored as `value2`
  rename(value2 = value) %>% 
  distinct()





# ' -------------------------------------------------------------------------------------------------------------------------
# > Make a single file -------------------------------------------------------------------------------------------------------------------------


### > define date -----------------------------------------------------------------------------------------------------------------
### define the date for which we will be manually adding variables (in ddmmyyyy format)
### it should be the date on which the latest full iteration of the database was created,
### which should also be specified in the names of the relevant .csv files in '00_database' containing
### indicators from each individual dashboard section
### unless otherwise specified, use the latest date from the files in the '00_database' folder

# names of the section files; the stand-alone 8-digit token in a name is read as its date (ddmmyyyy)
csv_files = list.files(path = file.path('data', 'clean', '00_database'), pattern = "\\.csv$", full.names = FALSE)
# regmatches() returns a list with one element per file - flatten it to a character vector
# before parsing, so that files without a date token (empty elements) simply drop out
dates1 = regmatches(csv_files, gregexpr("\\b\\d{8}\\b", csv_files)) %>% unlist %>% unique
# latest date found, formatted back to ddmmyyyy
date1 = format(as.Date(max_miss(dmy(dates1))), "%d%m%Y")

### over-write the date manually 
# date1 = '02102025'


### list all the files
# section files for the chosen date: name contains `date1` and ends in ').csv'
# (pattern anchored at the end, like the '.csv' pattern used when detecting the date)
database_files_all = list.files(file.path('data', 'clean', '00_database'),
                                pattern = paste0(date1, '.*\\)\\.csv$'),
                                full.names = TRUE) %>% print

### read and row-bind into 'dta_all'
dta_all = rbindlist(lapply(database_files_all, fread, encoding = 'UTF-8'), use.names = TRUE, fill = TRUE)

# 'NANA' organisation labels are replaced with the all-organisations label
# (presumably two pasted NA values - TODO confirm)
dta_all$organizacion[dta_all$organizacion == 'NANA'] = 'Todas las organizaciones'

# drop helper columns if they are present
dta_all = dta_all %>% select(-c(any_of(c('sectorial_nombre','entidad_nombre', 'value2'))))



### compare to what's in the instructions file
read_instructions()
# keep only instruction rows numbered below 900
country_instructions = filter(country_instructions, fila < 900)
fdistinct(country_instructions$indicador)

# indicators listed in the instructions but absent from the database ...
setdiff(country_instructions$indicador, unique(dta_all$indicador)) # the output can be passed to the row list below
# ... and indicators in the database that the instructions do not list
setdiff(unique(dta_all$indicador), country_instructions$indicador) # the output can be passed to the row list below
# instruction row numbers of the indicators missing from the database
with(country_instructions, fila[!(indicador %in% unique(dta_all$indicador))])



### full database
# de-duplicated rows of every section, written as one UTF-8 csv
dta_all %>% 
  distinct() %>% 
  fwrite(file.path(main_dir, 'data', 'clean',
                   paste0("Country full database (", date1 , ").csv")),
         encoding = 'UTF-8')


### government-wide values only 
# rows where both the organisation type and the organisation are the all-organisations label
temp = dta_all %>% 
  filter(tipo_organizacion == 'Todas las organizaciones', organizacion == 'Todas las organizaciones') %>% 
  distinct()

fwrite(temp,
       file.path(main_dir, 'data', 'clean', paste0("Country todo el gobierno (", date1 , ").csv")),
       encoding = 'UTF-8')



### specifically for Direccion Nacional del Servicio Civil
temp = dta_all %>% 
  filter(organizacion == 'Direccion Nacional del Servicio Civil') %>% 
  distinct()
fwrite(temp,
       file.path(main_dir, 'data', 'clean', paste0("Country DNSC (", date1 , ").csv")),
       encoding = 'UTF-8')






# ' ---------------------------------------------------------------------------------------------------
# END OF CODE ------------------------------------------------------------------------------------------
#
